Gender bias in audience of seminars and career position
Data
EcoEncontros Seminar talks
Talks from the EcoEncontros Seminar series at the Graduate Program of Ecology of the University of São Paulo (PPGE-USP), Brazil
See the file metadata.txt, in the data folder, for a more detailed description of the dataset.
# Load the seminar talks table (comma-separated, with a header row).
# as.is = TRUE keeps text columns as character instead of converting them.
data <- read.table(
  "data/presentations_PPGE_2008-2019.csv",
  sep = ",", header = TRUE, as.is = TRUE
)
# Parse the date column with dmy() and derive the year of each talk from it.
data$date <- dmy(data$date)
data$year <- year(data$date)
#skimr::skim(data)Excluding special events, such as round tables and discussions, that are not related to a project or study presented by someone.
IDs <- c(154, 250, 211, 289)
data <- data %>% filter(!id %in% IDs)For this specific analysis, we exclude speakers who are not in academia (“others”) and keep undergraduate, MD and PhD students (grouped as student), postdocs, and professors or researchers*.
*Researchers are included in the professor categorical position (column position_cat) because all of them come from research institutions.
data <- data %>% filter(position_cat != "others")
data$position_cat <- fct_relevel(data$position_cat, "student",
"postdoc","professor")Excluding seminars with more than one speaker
events <- data %>% count(id) %>% filter(n>1)
data <- data %>% filter(!id %in% events$id,
!is.na(audience_n))dim(data)## [1] 299 30
Data description
Audience by gender and academic position
ggplot(data, aes(x=position_cat, y=audience_n, fill=gender)) +
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
geom_boxplot() #geom_violin(position = position_dodge(0.8)) +
#geom_jitter(position=position_jitterdodge(0.2),shape=21)library(ggbeeswarm)
# Alternative view: violins with the individual talks overlaid and a
# horizontal bar at the median of each position x gender group.
ggplot(data, aes(x = position_cat, y = audience_n, fill = gender)) +
  scale_fill_manual(values = c("#b2abd2", "#fdb863")) +
  scale_color_manual(values = c("#b2abd2", "#fdb863")) +
  geom_violin(col = "black") +
  geom_quasirandom(dodge.width = 0.9, shape = 21) +
  # `fun` replaces the deprecated `fun.y`, and after_stat(y) replaces `..y..`.
  # ymin == ymax collapses the error bar into a single median line.
  stat_summary(
    mapping = aes(ymin = after_stat(y), ymax = after_stat(y)),
    fun = median, geom = "errorbar",
    width = 0.8, size = 0.8,
    position = position_dodge(width = 0.9)
  ) +
xlab("") + ylab("Audience (N)")Variation in time
ggplot(data, aes(x=date, y=audience_n, fill=gender)) +
facet_wrap(~position_cat, ncol=1)+
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
scale_color_manual(values = c("#b2abd2", "#fdb863"))+
geom_quasirandom(dodge.width = 0.9, shape=21)+
geom_smooth()+
xlab("") + ylab("Audience (N)")Looking for possible biases for speakers from inside and outside PPGE.
data$ppge <- ifelse(data$origin == "IB", "inside", "outside")
table(data$gender,data$ppge)##
## inside outside
## F 76 49
## M 79 95
# Audience by speaker origin (inside vs. outside, from data$ppge) and gender:
# violins, individual talks, and a horizontal bar at each group's median.
ggplot(data, aes(x = ppge, y = audience_n, fill = gender)) +
  scale_fill_manual(values = c("#b2abd2", "#fdb863")) +
  scale_color_manual(values = c("#b2abd2", "#fdb863")) +
  geom_violin(col = "black") +
  geom_quasirandom(dodge.width = 0.9, shape = 21) +
  # `fun` replaces the deprecated `fun.y`, and after_stat(y) replaces `..y..`.
  # ymin == ymax collapses the error bar into a single median line.
  stat_summary(
    mapping = aes(ymin = after_stat(y), ymax = after_stat(y)),
    fun = median, geom = "errorbar",
    width = 0.8, size = 0.8,
    position = position_dodge(width = 0.9)
  ) +
xlab("PPGE") + ylab("Audience (N)")Looking for possible biases for speakers from Brazil and abroad.
data$brazilian <- ifelse(data$country == "Brasil", "yes", "no")
table(data$gender,data$brazilian)##
## no yes
## F 22 103
## M 50 124
# Audience by speaker nationality flag (data$brazilian: "yes"/"no") and
# gender: violins, individual talks, and a bar at each group's median.
ggplot(data, aes(x = brazilian, y = audience_n, fill = gender)) +
  scale_fill_manual(values = c("#b2abd2", "#fdb863")) +
  scale_color_manual(values = c("#b2abd2", "#fdb863")) +
  geom_violin(col = "black") +
  geom_quasirandom(dodge.width = 0.9, shape = 21) +
  # `fun` replaces the deprecated `fun.y`, and after_stat(y) replaces `..y..`.
  # ymin == ymax collapses the error bar into a single median line.
  stat_summary(
    mapping = aes(ymin = after_stat(y), ymax = after_stat(y)),
    fun = median, geom = "errorbar",
    width = 0.8, size = 0.8,
    position = position_dodge(width = 0.9)
  ) +
xlab("Brazilian") + ylab("Audience (N)")Modeling
Negative binomial
data$affirm_action <- ifelse(data$year<2018,"before", "after")
data$affirm_action <- fct_relevel(data$affirm_action, "before", "after")mg0 <- glm.nb(audience_n~ 1, data=data)
# Candidate negative-binomial models for audience size. Models whose name ends
# in "b" use the before/after factor `affirm_action` in place of `year`.

# --- one predictor ---
mg1 <- glm.nb(audience_n ~ gender, data = data)
mg2 <- glm.nb(audience_n ~ position_cat, data = data)
mg3 <- glm.nb(audience_n ~ year, data = data)
mg3b <- glm.nb(audience_n ~ affirm_action, data = data)

# --- two predictors, additive ---
mg4 <- glm.nb(audience_n ~ gender + position_cat, data = data)
mg5 <- glm.nb(audience_n ~ gender + year, data = data)
mg5b <- glm.nb(audience_n ~ gender + affirm_action, data = data)
mg6 <- glm.nb(audience_n ~ year + position_cat, data = data)
mg6b <- glm.nb(audience_n ~ affirm_action + position_cat, data = data)

# --- two predictors, with interaction ---
mg7 <- glm.nb(audience_n ~ gender * position_cat, data = data)
mg8 <- glm.nb(audience_n ~ gender * year, data = data)
mg8b <- glm.nb(audience_n ~ gender * affirm_action, data = data)
mg9 <- glm.nb(audience_n ~ year * position_cat, data = data)
mg9b <- glm.nb(audience_n ~ affirm_action * position_cat, data = data)

# --- three predictors, additive ---
mg10 <- glm.nb(audience_n ~ gender + position_cat + year, data = data)
mg10b <- glm.nb(audience_n ~ gender + position_cat + affirm_action, data = data)

# --- three predictors, one two-way interaction ---
mg11 <- glm.nb(audience_n ~ gender * position_cat + year, data = data)
mg11b <- glm.nb(audience_n ~ gender * position_cat + affirm_action, data = data)
mg12 <- glm.nb(audience_n ~ gender + position_cat * year, data = data)
mg12b <- glm.nb(audience_n ~ gender + position_cat * affirm_action, data = data)
mg13 <- glm.nb(audience_n ~ gender * year + position_cat, data = data)
mg13b <- glm.nb(audience_n ~ gender * affirm_action + position_cat, data = data)

# --- three predictors, full three-way interaction ---
mg14 <- glm.nb(audience_n ~ gender * position_cat * year, data = data)
mg14b <- glm.nb(audience_n ~ gender * position_cat * affirm_action, data = data)
AICtab(mg2, mg0,mg1, mg3, mg4,mg5,mg6,mg7,mg8,mg9,mg10,mg11,mg12,mg13,mg14,
mg3b,mg5b,mg6b,mg8b,mg9b,mg10b,mg11b,mg12b,mg13b, mg14b,
base=T, weights=T) %>% kable(digits=2)| AIC | dAIC | df | weight | |
|---|---|---|---|---|
| mg11b | 2168.07 | 0.00 | 8 | 0.48 |
| mg10b | 2169.26 | 1.19 | 6 | 0.26 |
| mg13b | 2171.26 | 3.19 | 7 | 0.10 |
| mg12b | 2172.47 | 4.41 | 8 | 0.05 |
| mg11 | 2173.77 | 5.70 | 8 | 0.03 |
| mg10 | 2174.21 | 6.14 | 6 | 0.02 |
| mg6b | 2174.87 | 6.81 | 5 | 0.02 |
| mg14b | 2175.67 | 7.60 | 13 | 0.01 |
| mg7 | 2175.79 | 7.72 | 7 | 0.01 |
| mg13 | 2176.19 | 8.12 | 7 | 0.01 |
| mg4 | 2176.31 | 8.24 | 5 | 0.01 |
| mg12 | 2177.82 | 9.75 | 8 | 0.00 |
| mg9b | 2178.54 | 10.47 | 7 | 0.00 |
| mg6 | 2179.00 | 10.93 | 5 | 0.00 |
| mg2 | 2181.06 | 13.00 | 4 | 0.00 |
| mg14 | 2182.06 | 14.00 | 13 | 0.00 |
| mg9 | 2182.24 | 14.17 | 7 | 0.00 |
| mg5b | 2202.12 | 34.05 | 4 | 0.00 |
| mg8b | 2203.83 | 35.76 | 5 | 0.00 |
| mg5 | 2205.11 | 37.05 | 4 | 0.00 |
| mg1 | 2206.19 | 38.13 | 3 | 0.00 |
| mg8 | 2207.09 | 39.03 | 5 | 0.00 |
| mg3b | 2219.15 | 51.08 | 3 | 0.00 |
| mg3 | 2220.65 | 52.59 | 3 | 0.00 |
| mg0 | 2221.46 | 53.40 | 2 | 0.00 |
Residual diagnostic
hnp::hnp(mg11b)## Negative binomial model (using MASS package)
plot(simulateResiduals(mg11b))hnp::hnp(mg10b)## Negative binomial model (using MASS package)
plot(simulateResiduals(mg10b))Models result
summary(mg11b)##
## Call:
## glm.nb(formula = audience_n ~ gender * position_cat + affirm_action,
## data = data, init.theta = 6.652451909, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8934 -0.7775 -0.1330 0.4896 3.8037
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.80898 0.05405 51.973 < 2e-16 ***
## genderM 0.11625 0.07453 1.560 0.11882
## position_catpostdoc 0.11708 0.10696 1.095 0.27372
## position_catprofessor 0.20059 0.10432 1.923 0.05449 .
## affirm_actionafter 0.19567 0.06316 3.098 0.00195 **
## genderM:position_catpostdoc -0.12030 0.14428 -0.834 0.40440
## genderM:position_catprofessor 0.22897 0.12861 1.780 0.07503 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(6.6525) family taken to be 1)
##
## Null deviance: 376.12 on 298 degrees of freedom
## Residual deviance: 303.02 on 292 degrees of freedom
## AIC: 2168.1
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 6.652
## Std. Err.: 0.708
##
## 2 x log-likelihood: -2152.067
performance::r2(mg11b)## # R2 for Generalized Linear Regression
## Nagelkerke's R2: 0.303
Average audience before affirmative actions: 16.5929329
Average audience after affirmative actions: 20.179078
myg11b <- ggpredict(mg11b, terms=c("position_cat","gender", "affirm_action"))
as.data.frame(myg11b)%>% mutate(predicted = round(predicted,digits=0))## x predicted std.error conf.low conf.high group facet
## 1 student 17 0.05404728 14.92513 18.44711 F before
## 2 student 20 0.06992406 17.59470 23.14306 F after
## 3 student 19 0.05736112 16.65651 20.85622 M before
## 4 student 23 0.06847745 19.81983 25.92245 M after
## 5 postdoc 19 0.09474248 15.49256 22.46018 F before
## 6 postdoc 23 0.10566643 18.44179 27.90550 F after
## 7 postdoc 19 0.07992838 15.88458 21.72931 M before
## 8 postdoc 23 0.09796456 18.64668 27.37642 M after
## 9 professor 20 0.09257254 16.91385 24.31298 F before
## 10 professor 25 0.10078386 20.24096 30.04732 F after
## 11 professor 29 0.05126210 25.90194 31.66658 M before
## 12 professor 35 0.07476636 30.08179 40.32609 M after
Complete figure
# Predictions from myg11b as a data frame. The `facet` column carries the
# affirm_action level, so it is renamed to match the facetting variable used
# for the raw data below.
prs <- as.data.frame(myg11b) %>%
  rename(affirm_action = facet)

# Raw audiences (transparent points) with the model predictions and their
# confidence limits overlaid as point ranges, one panel per period.
ggplot(data, aes(x = position_cat, y = audience_n)) +
  geom_point(
    aes(col = gender),
    position = position_dodge(0.6), alpha = 0.3, size = 3,
    show.legend = FALSE
  ) +
  facet_grid(
    ~affirm_action,
    labeller = as_labeller(c(
      "before" = "Before affirmative actions",
      "after" = "After affirmative actions"
    ))
  ) +
  # Lighter alternative palette: c("#b2abd2", "#fdb863")
  scale_color_manual(values = c("#6D57CF", "#FCA532")) +
  scale_fill_manual(name = "Gender", values = c("#6D57CF", "#FCA532")) +
  geom_pointrange(
    data = prs,
    aes(
      x = x, y = predicted, fill = group,
      ymax = conf.high, ymin = conf.low
    ),
    alpha = 1, position = position_dodge(0.6),
    size = 1, shape = 21, col = "black"
  ) +
xlab("Academic position") + ylab("Audience (N)") ggsave("figures/audience_speakers.jpeg", width=8, height = 4) summary(mg10b)##
## Call:
## glm.nb(formula = audience_n ~ gender + position_cat + affirm_action,
## data = data, init.theta = 6.502702402, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8101 -0.7831 -0.1381 0.4704 3.9188
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.79293 0.04972 56.177 < 2e-16 ***
## genderM 0.15356 0.05463 2.811 0.00494 **
## position_catpostdoc 0.04418 0.07250 0.609 0.54229
## position_catprofessor 0.36571 0.06046 6.049 1.46e-09 ***
## affirm_actionafter 0.18918 0.06316 2.995 0.00274 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(6.5027) family taken to be 1)
##
## Null deviance: 369.75 on 298 degrees of freedom
## Residual deviance: 303.07 on 294 degrees of freedom
## AIC: 2169.3
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 6.503
## Std. Err.: 0.688
##
## 2 x log-likelihood: -2157.260
performance::r2(mg10b)## # R2 for Generalized Linear Regression
## Nagelkerke's R2: 0.282
myg10b <- ggpredict(mg10b, terms=c("position_cat","gender", "affirm_action"))
plot(myg10b) +
scale_color_manual(values = c("#b2abd2", "#fdb863"))Only professors - productivity metrics
Investigating whether differences in productivity between male and female professors and researchers are related to audience size.
Measured productivity publication metrics from Google Scholar for professors and researchers.
Creating productivity index using PCA 1st axis from metrics.
PCA productivity metrics
# Keep only the rows where both productivity metrics are recorded.
# Columns are referenced through dplyr's data mask rather than `data$...`,
# so the conditions always apply to the data frame being filtered.
dp <- data %>%
  filter(
    !is.na(total_citation_n),
    !is.na(nature_index_count)
  )
table(dp$gender, dp$affirm_action)##
## before after
## F 14 6
## M 58 9
Productivity publication metrics
# PCA on columns 21 to 28 of dp, without the automatic plots.
# NOTE(review): the columns are selected by position and are presumably the
# publication metrics described in the text; confirm against
# data/metadata.txt, since any change in column order silently changes the
# PCA input.
pca1 <- PCA(dp[, 21:28], graph = FALSE)

# Biplot of individuals (points coloured by gender, with ellipses) and
# variables, with dashed reference lines through the origin.
# NOTE(review): the axis percentages are hard-coded; confirm against pca1$eig
# if the data change.
p1 <- fviz_pca_biplot(
  pca1,
  col.ind = dp$gender, addEllipses = TRUE,
  col.ind.sub = "none", geom = "point",
  repel = TRUE
) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  geom_hline(yintercept = 0, linetype = "dashed") +
  scale_color_manual(name = "gender", values = c("#6D57CF", "#FCA532")) +
  scale_shape(name = "gender") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532")) +
  ggtitle("PCA biplot for professors' productivity metrics") +
  xlab("PC1 (52%)") + ylab("PC2 (21%)") +
  theme_cowplot()
p1ggsave("figures/pca_biplot.jpeg", width=8, height = 7) Extracting PCA 2 first axes
dp$pc1 <- pca1$ind$coord[,1]
dp$pc2 <- pca1$ind$coord[,2]Modeling
OBS interna (apagar depois): podem perguntar por que a gente não colocou o affirm_action no modelo, já que é importante para prever a audiência (como vimos no modelo anterior). Eu até fiz uns testes, mas não acho que devamos complicar este modelo, já que o foco aqui é comparar professores quanto à produtividade, com a hipótese clara de que, dada uma mesma produtividade, ainda assim as mulheres vão ter menor audiência que os homens. Então é bom deixar isso claro no texto: nessa análise estamos focando apenas em gênero e métricas de produtividade e, por isso, não há tempo nesses modelos. Eu também preferi colocar pc1 e pc2 sempre juntos como variáveis de produtividade, sem fazer modelos separados.
# Candidate negative-binomial models fitted to dp; pc1 and pc2 always enter
# together.
m0 <- glm.nb(audience_n ~ 1, data = dp)                            # intercept only
m1 <- glm.nb(audience_n ~ gender, data = dp)                       # gender only
m2 <- glm.nb(audience_n ~ pc1 + pc2, data = dp)                    # PCA axes only
m3 <- glm.nb(audience_n ~ gender + pc1 + pc2, data = dp)           # additive
m4 <- glm.nb(audience_n ~ gender * pc1 + gender * pc2, data = dp)  # interactions
AICtab(m0,m1,m2,m3,m4,
base=T, weights=T) %>% kable(digits=2)| AIC | dAIC | df | weight | |
|---|---|---|---|---|
| m3 | 693.36 | 0.00 | 5 | 0.55 |
| m2 | 695.53 | 2.17 | 4 | 0.19 |
| m4 | 695.74 | 2.38 | 7 | 0.17 |
| m1 | 697.16 | 3.80 | 3 | 0.08 |
| m0 | 700.73 | 7.37 | 2 | 0.01 |
Residual diagnostic
Best model
hnp(m3)## Negative binomial model (using MASS package)
plot(simulateResiduals(m3))Model results
summary(m3)##
## Call:
## glm.nb(formula = audience_n ~ gender + pc1 + pc2, data = dp,
## init.theta = 5.219733971, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5233 -0.7225 -0.2033 0.4610 3.3874
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.11594 0.11007 28.310 <2e-16 ***
## genderM 0.26225 0.12520 2.095 0.0362 *
## pc1 0.07478 0.02511 2.978 0.0029 **
## pc2 -0.01698 0.03941 -0.431 0.6665
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(5.2197) family taken to be 1)
##
## Null deviance: 103.589 on 86 degrees of freedom
## Residual deviance: 89.206 on 83 degrees of freedom
## AIC: 693.36
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 5.220
## Std. Err.: 0.927
##
## 2 x log-likelihood: -683.359
performance::r2(m3)## # R2 for Generalized Linear Regression
## Nagelkerke's R2: 0.219
my3 <- ggpredict(m3, terms=c("pc1","gender"))
my3 <- as.data.frame(my3)
ggplot(my3, aes(x=x, y=predicted, col=group)) +
geom_ribbon(aes(ymin=conf.low,ymax=conf.high, fill=group), alpha=0.3,
colour = NA) +
geom_line()+
scale_color_manual(name="Gender",values = c("#6D57CF","#FCA532"))+
scale_fill_manual(name="Gender",values = c("#6D57CF","#FCA532"))+
theme_cowplot() + ggtitle("") +
ylab("Audience (N)") + xlab("Productivity index (PC1 axis)")+
geom_point(data=dp, aes(x=pc1, y=audience_n, col=gender), alpha=0.6)ggsave("figures/audience_professor.jpeg", width=9, height = 6) PC 2 - não importante, não “significativo”:
my3 <- ggpredict(m3, terms=c("pc2","gender"))
plot(my3) +
scale_color_manual(values = c("#b2abd2", "#fdb863"))+
scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
theme_cowplot()Figure audience
# Predictions from myg11b, with the ggpredict columns renamed to the variable
# names used in `data` so that the same x and facet mappings apply to both.
prs <- as.data.frame(myg11b) %>%
  rename(
    affirm_action = facet,
    position_cat = x
  )

# Panel A: raw audiences (transparent points) and model predictions with
# confidence limits (point ranges), by affirmative-action period, one facet
# per academic position.
f1 <- ggplot(data, aes(x = affirm_action, y = audience_n)) +
  geom_point(
    aes(col = gender),
    position = position_dodge(0.6), alpha = 0.3, size = 3,
    show.legend = FALSE
  ) +
  facet_grid(~position_cat) +
  # Lighter alternative palette: c("#b2abd2", "#fdb863")
  scale_color_manual(values = c("#6D57CF", "#FCA532")) +
  scale_fill_manual(name = "Gender", values = c("#6D57CF", "#FCA532")) +
  geom_pointrange(
    data = prs,
    aes(
      x = affirm_action, y = predicted, fill = group,
      ymax = conf.high, ymin = conf.low
    ),
    alpha = 1, position = position_dodge(0.6),
    size = 1, shape = 21, col = "black"
  ) +
  ylab("Audience (N)") +
  xlab("Affirmative actions") +
  labs(tag = "A")
# Predictions of m3 along pc1 for each gender, as a plain data frame. The
# constant `prof` column exists only to give the panel a facet strip label.
my3 <- as.data.frame(ggpredict(m3, terms = c("pc1", "gender")))
my3$prof <- "Professors only"

# Panel C: predicted audience against pc1 (line plus confidence ribbon) per
# gender, with the observed talks from dp overlaid as points.
f2 <- ggplot(my3, aes(x = x, y = predicted, col = group)) +
  geom_ribbon(
    aes(ymin = conf.low, ymax = conf.high, fill = group),
    alpha = 0.3, colour = NA
  ) +
  geom_line(size = 1.5) +
  facet_grid(~prof) +
  scale_color_manual(name = "Gender", values = c("#6D57CF", "#FCA532")) +
  scale_fill_manual(name = "Gender", values = c("#6D57CF", "#FCA532")) +
  theme_cowplot() +
  ggtitle("") +
  ylab("Audience (N)") +
  xlab("Productivity index (PC1 axis)") +
  geom_point(
    data = dp,
    aes(x = pc1, y = audience_n, col = gender),
    alpha = 0.6, size = 2
  ) +
  theme(legend.position = "none") +
  labs(tag = "C")
# Empty placeholder panel.
# NOTE(review): f3 is not referenced anywhere else in the visible code;
# confirm it is unused before removing it.
f3<- plot_spacer()
# Text layout handed to plot_layout(design = ...) below: the first row is
# area 1 across all four columns, the second row is area 2 in the two middle
# columns. Presumably "#" marks an empty cell (patchwork convention); verify.
design <- "
1111
#22#
"
f1 + f2 +
plot_layout( design=design, guides="collect")ggsave("figures/FIG_audience.jpeg", width=9, height = 8) p1<- fviz_pca_biplot(pca1, col.ind = dp$gender, addEllipses=TRUE,
col.ind.sub="none", geom="point",
repel = TRUE) +
facet_grid(.~.)+
geom_vline(xintercept = 0, linetype="dashed") +
geom_hline(yintercept = 0, linetype="dashed")+
scale_color_manual(name="gender",values = c("#6D57CF","#FCA532"))+
scale_shape(name="gender")+
scale_fill_manual(name="gender",values = c("#6D57CF","#FCA532"))+
labs(title="PCA professors' productivity", tag="B") +
xlab("PC1 (52%)") + ylab("PC2 (21%)") +
theme_cowplot() +
theme(legend.position="none",
plot.title = element_text(size=12, vjust=-5, hjust=0)) #+
#coord_cartesian(clip = "off")+
# scale_x_continuous(limits=c(-6,8), expand=c(0,0))+
# scale_y_continuous(limits=c(-4,8), breaks=c(-4,-2,0,2,4,6))
# annotate("rect", xmin=-6, xmax=8, ymin=7,ymax=8, fill="gray85")+
#annotate("text",label="Professors'productivity", x=0, y=7.5, fill="gray85")
f1/(p1+f2) +plot_layout(guides="collect")ggsave("figures/FIG_audience_test.jpeg", width=9, height = 8)